In [1]:
%matplotlib inline

import gzip
import json
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from collections import defaultdict, Counter


/home/entity/anaconda2/lib/python2.7/site-packages/matplotlib/font_manager.py:279: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [2]:
%%time
data = []
media_types = defaultdict(int)
url_types = defaultdict(int)
unique_urls = set()
with gzip.open("all_ids.txt.json.gz") as fp:
    for line in fp:
        d = json.loads(line.strip())
        data.append(d)
        if 'entities' not in d:
            continue
        if 'media' in d['entities']:
            m_entities = d['entities']['media']
            for m in m_entities:
                m_type = m['type']
                media_types[m_type] += 1
        if 'urls' in d['entities']:
            m_entities = d['entities']['urls']
            for m in m_entities:
                media_types['url'] += 1
                m = m['expanded_url']
                m_type = m.split("/", 3)[2]
                unique_urls.add((m, m_type))
                url_types[m_type] += 1
                
print(media_types)
url_types = Counter(url_types)
print(len(url_types), len(unique_urls))


defaultdict(<type 'int'>, {'url': 166670, u'photo': 27682})
(8750, 119558)
CPU times: user 1min 14s, sys: 8.02 s, total: 1min 22s
Wall time: 1min 22s

In [3]:
url_types.most_common(50)


Out[3]:
[(u'twitter.com', 24978),
 (u'bit.ly', 15148),
 (u'fb.me', 15069),
 (u'ow.ly', 6866),
 (u'dlvr.it', 5398),
 (u'ift.tt', 4693),
 (u'goo.gl', 4039),
 (u'ln.is', 3795),
 (u'youtu.be', 3784),
 (u'gvwy.io', 3120),
 (u'www.instagram.com', 2761),
 (u'buff.ly', 2331),
 (u'www.newsweek.com', 1949),
 (u'www.youtube.com', 1170),
 (u'nyti.ms', 1119),
 (u'tinyurl.com', 1083),
 (u'wp.me', 1048),
 (u'm.tbnn.it', 960),
 (u'shar.es', 845),
 (u'www.naturalnews.com', 798),
 (u'warontherocks.com', 739),
 (u'truthinmedia.com', 677),
 (u'cnn.it', 670),
 (u'rover.ebay.com', 604),
 (u'dld.bz', 524),
 (u'www.periscope.tv', 515),
 (u'lnkd.in', 504),
 (u'www.huffingtonpost.com', 486),
 (u'b.autovist.com', 473),
 (u'fxn.ws', 468),
 (u'www.breitbart.com', 461),
 (u'www.facebook.com', 421),
 (u'www.nytimes.com', 416),
 (u'n.pr', 405),
 (u'www.infowars.com', 404),
 (u'a.msn.com', 397),
 (u'thefederalist.com', 385),
 (u'apple.news', 379),
 (u'go.shr.lc', 378),
 (u'NaturalNews.com', 373),
 (u'www.foxnews.com', 362),
 (u'wpo.st', 350),
 (u'pinterest.com', 346),
 (u'www.cnn.com', 325),
 (u'www.yahoo.com', 319),
 (u'amzn.to', 317),
 (u'on.mash.to', 316),
 (u'wapo.st', 314),
 (u'brev.is', 310),
 (u'j.mp', 305)]

In [4]:
sorted(unique_urls,
                      key=lambda x: url_types[x[1]],
                     reverse=True)[:10]


Out[4]:
[(u'https://twitter.com/i/web/status/787248028335808513', u'twitter.com'),
 (u'https://twitter.com/mr_dsantos/status/792410135582875648', u'twitter.com'),
 (u'https://twitter.com/i/web/status/789400744810024960', u'twitter.com'),
 (u'https://twitter.com/candy_lass/status/692590229069254656', u'twitter.com'),
 (u'https://twitter.com/i/web/status/791387309992280064', u'twitter.com'),
 (u'https://twitter.com/_ijmtybx/status/743864533089947648', u'twitter.com'),
 (u'https://twitter.com/i/web/status/784460833912975360', u'twitter.com'),
 (u'https://twitter.com/i/web/status/792218124707729408', u'twitter.com'),
 (u'https://twitter.com/CaptainCreole/status/798946586730659840',
  u'twitter.com'),
 (u'https://twitter.com/tazerblack/status/786997527560224769', u'twitter.com')]

Run code to get all URLs

with open("all_urls.txt", "wb+") as fp:
    for url in sorted(filter(lambda x: x[1] != 'twitter.com',
            unique_urls),
                      key=lambda x: url_types[x[1]],
                     reverse=True):
        print >> fp, "%s\t%s\t%s" % (url[0], url[1], url_types[url[1]])

! head all_urls.txt

# python download_expanded.py --jobs 20 --batches 200 # Run this to expand URLs

In [6]:
! head exp_urls.txt


http://fb.me/6Ry198BOC	http://fb.me/6Ry198BOC
http://fb.me/8Z6wy6V1o	http://worldtruth.tv/body-of-doctor-who-linked-vaccines-to-autism-found-floating-in-river/
http://fb.me/2xq5ADQaW	http://www.trueactivist.com/courts-quietly-confirm-mmr-vaccine-causes-autism/
http://fb.me/1erJaFNs1	http://www.theblaze.com/stories/2016/05/31/12-year-old-science-whiz-gathers-and-shares-all-the-evidence-that-vaccines-cause-autism/?utm_source=facebook&utm_medium=story&utm_campaign=ShareButtons
http://fb.me/7QrKP6H94	http://fb.me/7QrKP6H94
http://fb.me/3Vqc5yhbx	http://www.lifenews.com/2014/09/09/study-links-autism-to-vaccines-made-with-cells-from-aborted-babies/#.V9rBbPGPpHM.facebook
http://fb.me/5hVmXkvZT	http://tylervigen.com/spurious-correlations
http://fb.me/4oBG06FuV	https://www.facebook.com/photo.php?fbid=1066132846768368
http://fb.me/7IkGrAXn9	http://www.dailymail.co.uk/news/article-3141287/Authorities-Anti-vaccine-doctor-dead-apparent-suicide.html
http://fb.me/1qV1jUl3P	http://www.npr.org/sections/health-shots/2016/11/28/503592933/flu-vaccine-during-pregnancy-not-linked-to-autism?utm_source=facebook.com&utm_medium=social&utm_campaign=npr&utm_term=nprnews&utm_content=2055

In [7]:
data[0].keys()


Out[7]:
[u'contributors',
 u'truncated',
 u'text',
 u'is_quote_status',
 u'in_reply_to_status_id',
 u'id',
 u'favorite_count',
 u'source',
 u'quoted_status_id',
 u'retweeted',
 u'coordinates',
 u'quoted_status',
 u'entities',
 u'in_reply_to_screen_name',
 u'id_str',
 u'retweet_count',
 u'in_reply_to_user_id',
 u'favorited',
 u'user',
 u'geo',
 u'in_reply_to_user_id_str',
 u'possibly_sensitive',
 u'lang',
 u'created_at',
 u'quoted_status_id_str',
 u'in_reply_to_status_id_str',
 u'place']

In [8]:
data[0][u'source']


Out[8]:
u'<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>'

In [9]:
data[0][u'is_quote_status']


Out[9]:
True

In [10]:
data[0][u'quoted_status']['text']


Out[10]:
u'Overnight apartment fire in Tampa #10News https://t.co/gDBsG8udFg'

In [11]:
data[0]['text']


Out[11]:
u'Getting a better look at the damage now that the sun is up.  Very sad https://t.co/DZrhrubgf9'

In [12]:
count_quoted = 0
has_coordinates = 0
count_replies = 0
language_ids = defaultdict(int)
count_user_locs = 0
user_locs = Counter()
count_verified = 0
for d in data:
    count_quoted += d.get('is_quote_status', 0)
    coords = d.get(u'coordinates', None)
    repl_id = d.get(u'in_reply_to_status_id', None)
    has_coordinates += (coords is not None)
    count_replies += (repl_id is not None)
    loc = d['user'].get('location', u'')
    count_verified += d['user']['verified']
    if loc != u'':
        count_user_locs += 1
        user_locs.update([loc])
    language_ids[d['lang']] += 1
    
print count_quoted, has_coordinates, count_replies, count_user_locs, count_verified


21382 646 53296 281811 11366

In [13]:
count_verified


Out[13]:
11366

In [14]:
user_locs.most_common(10)


Out[14]:
[(u'United States', 10420),
 (u'USA', 7880),
 (u'Washington, DC', 4310),
 (u'New York, NY', 3082),
 (u'California, USA', 3018),
 (u'Los Angeles, CA', 2719),
 (u'New York', 2312),
 (u'Chicago, IL', 2179),
 (u'New York, USA', 2021),
 (u'Texas', 1773)]

In [15]:
len(data)


Out[15]:
328318

In [16]:
data[0]['user']


Out[16]:
{u'contributors_enabled': False,
 u'created_at': u'Tue Jul 14 00:13:13 +0000 2009',
 u'default_profile': False,
 u'default_profile_image': False,
 u'description': u'Executive Producer at 10News WTSP in Tampa/St. Petersburg. Indiana University graduate.',
 u'entities': {u'description': {u'urls': []}},
 u'favourites_count': 345,
 u'follow_request_sent': False,
 u'followers_count': 573,
 u'following': False,
 u'friends_count': 503,
 u'geo_enabled': True,
 u'has_extended_profile': False,
 u'id': 56544119,
 u'id_str': u'56544119',
 u'is_translation_enabled': False,
 u'is_translator': False,
 u'lang': u'en',
 u'listed_count': 68,
 u'location': u'St. Petersburg',
 u'name': u'Melissa Ramsey',
 u'notifications': False,
 u'profile_background_color': u'0099B9',
 u'profile_background_image_url': u'http://abs.twimg.com/images/themes/theme4/bg.gif',
 u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme4/bg.gif',
 u'profile_background_tile': False,
 u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/56544119/1443718335',
 u'profile_image_url': u'http://pbs.twimg.com/profile_images/743866585635491840/Pa-vBAru_normal.jpg',
 u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/743866585635491840/Pa-vBAru_normal.jpg',
 u'profile_link_color': u'0099B9',
 u'profile_sidebar_border_color': u'5ED4DC',
 u'profile_sidebar_fill_color': u'95E8EC',
 u'profile_text_color': u'3C3940',
 u'profile_use_background_image': True,
 u'protected': False,
 u'screen_name': u'mramsey8',
 u'statuses_count': 1010,
 u'time_zone': u'Central Time (US & Canada)',
 u'translator_type': u'none',
 u'url': None,
 u'utc_offset': -21600,
 u'verified': False}

In [ ]: